Load and transform the data for every subject

First, let’s load the Power 2011 region database. This will be used as an “atlas” throughout, to guide the development of the regions.

# Load the Power et al. (2011) atlas: one row per ROI with MNI coordinates
# (X, Y, Z), a numeric network id, a plotting color, and the network name.
# NOTE(review): the path is relative to the document's working directory.
power2011 <- read_csv("../bin/power_2011.csv", 
                      col_types = cols(ROI=col_double(),
                                       X = col_double(),
                                       Y = col_double(),
                                       Z = col_double(),
                                       Network = col_double(),
                                       Color = col_character(),
                                       NetworkName = col_character())) %>%
  # Keep only the columns used downstream.
  dplyr::select(ROI, X, Y, Z, Network, Color, NetworkName)

Create the Group-Level Regressor Matrix \(X\)

We now need to load the group level data. In essence, this corresponds to creating a matrix X in which every individual is a row and every column is a different ROI-to-ROI connection.

# NOFLY accumulates subjects flagged for NA connectivity; SUBJS the subject ids.
NOFLY <- c()
SUBJS <- c()
# All 264 x 264 ordered ROI-pair labels "i-j", flattened column-major so they
# align with the flattened connectivity matrices loaded below.
cols <- outer(power2011$ROI, power2011$ROI, function(x, y) {paste(x, y, sep="-")})
cols %<>% as.vector

# Canonical (order-independent) label for an ROI pair: smaller id first,
# joined by "-". connection(5, 2) and connection(2, 5) both give "2-5".
connection <- function(x, y) {
  lo_hi <- range(c(x, y))
  paste(lo_hi[1], lo_hi[2], sep = "-")
}

# Elementwise version, usable inside outer().
vconnection <- Vectorize(connection)

# Modal (most frequent) value of a vector; ties resolve to the value that
# appears first in x.
#
# Args:
#   x: an atomic vector (numeric, character, factor, ...).
#   na.rm: drop NA values before counting? (FALSE by default, as before).
# Returns: the single most frequent element of x.
# Fixes: TRUE/FALSE instead of reassignable T/F; `<-` instead of `=` for
# assignment; implicit return of the last expression.
Mode <- function(x, na.rm = FALSE) {
  if (na.rm) {
    x <- x[!is.na(x)]
  }
  ux <- unique(x)
  ux[which.max(tabulate(match(x, ux)))]
}

# One row per network id, paired with its modal network name.
# NOTE(review): Network is the grouping variable, so mean(Network) is just
# Network itself; the summarize keeps the column in the output.
reduced_power2011 <- power2011 %>% 
  dplyr::select(Network, NetworkName) %>%
  group_by(Network) %>%
  summarize(Network = mean(Network), NetworkName = Mode(NetworkName))

# Human-readable "NameA-NameB" label for a pair of network ids, with the
# smaller id's name first.
#
# Args:
#   x, y: numeric network ids present in reduced_power2011$Network.
# Returns: a single character label, e.g. "Auditory-Visual".
# Fix: the original pasted the tibble returned by dplyr::select(), which
# yielded the right string only through accidental as.character() coercion
# of a one-element list; use direct vector indexing instead (the same
# approach as connection_name2).
connection_name <- function(x, y) {
  first <- min(x, y)
  second <- max(x, y)
  paste(reduced_power2011$NetworkName[reduced_power2011$Network == first],
        reduced_power2011$NetworkName[reduced_power2011$Network == second],
        sep = "-")
}

# Vectorized form for use with outer().
vconnection_name <- Vectorize(connection_name)

# Map a pair of network ids to the label "NameA-NameB", smaller id first,
# looking names up in the global reduced_power2011 table.
connection_name2 <- function(x, y) {
  lo <- min(x, y)
  hi <- max(x, y)
  name_of <- function(id) {
    reduced_power2011$NetworkName[reduced_power2011$Network == id]
  }
  paste(name_of(lo), name_of(hi), sep = "-")
}

# Vectorized form for use with outer().
vconnection_name2 <- Vectorize(connection_name2)


# Network-id pair labels ("1-5") and network-name pair labels for every ROI
# pair, flattened column-major so they align element-by-element with `cols`.
nets <- outer(power2011$Network, power2011$Network, vconnection)
nets %<>% as.vector
netnames <- outer(power2011$Network, power2011$Network, vconnection_name2)
netnames %<>% as.vector

# Number of subject directories, and the (subjects x connections) matrix
# that the loading loop below fills row by row.
n <- length(grep("sub-*", dir("../rsfmri/REST1")))
C <- matrix(data = rep(0, length(cols)*n), nrow =  n)

# Row counter into the connectivity matrix C.
j <- 1

R <- NULL
PR <- NULL

# Load each subject's ROI-by-ROI partial-correlation matrix and store it,
# flattened column-wise, as one row of C.
for (sub in dir("../rsfmri/REST1")[grep("sub-*", dir("../rsfmri/REST1"))]) {
  # Record the HCP subject id (the part after "sub-").
  SUBJS %<>% c(strsplit(sub, "-")[[1]][2])
  M <- paste("../rsfmri/REST1", 
             sub, 
             "ses-01/mr_pcorr.txt", sep="/") %>%
    read_csv(skip = 1,
             col_names = F,
             col_types = cols(
               .default = col_double(),
               X1 = col_character()
             )) %>%
    as.matrix() 
  # Columns 2:265 hold the 264 numeric columns; column 1 is the row label.
  v <- as_vector(M[,2:265])  # v spreads M column-wise. M is symmetrical, so it should not matter, but better not risk it
  C[j,] <- v
  if (length(v[is.na(v)]) > 0) {
    print(paste("NA detected in sub", sub))
    # NOTE(review): flagged subjects are kept in C; confirm they are
    # excluded (or imputed) before model fitting.
    NOFLY %<>% c(sub)  # Adds sub to NOFLY list
  }
  
  j <- j + 1
}

Define the Networks

If we want, we can restrict the analysis only to a limited set of networks (and their cross-network connections) by modifying the NOI (Networks of Interest) variable. The variable will be used to create a second list, COI (Connections of interest), which will contain the possible list of network-to-network connections for the networks in NOI. (This is currently not needed, since the \(X\) matrix is already restricted to meaningful connections).

# Networks of Interest: the Power (2011) network labels retained in the
# analysis. All fourteen networks are currently included.
NOI <- c(
  "Uncertain",
  "Sensory/somatomotor Hand",
  "Sensory/somatomotor Mouth",
  "Cingulo-opercular Task Control",
  "Auditory",
  "Default mode",
  "Memory retrieval?",
  "Ventral attention",
  "Visual",
  "Fronto-parietal Task Control",
  "Salience",
  "Subcortical",
  "Cerebellar",
  "Dorsal attention"
)

# Connections of Interest: every ordered pair of NOI names joined by "-",
# flattened column-major (identical to the original outer() + as.vector()).
COI <- as.vector(outer(NOI, NOI, paste, sep = "-"))

Now, we need to remove some columns from the hyper-large X matrix, and define proper groupings for Lasso.

First, we ensure that the data in the connectivity matrix C is actually numeric.

C <- apply(C, 2, FUN=as.numeric)

Then, we create a set of “censor” vectors, each of which is a binary vector with the same length as the number of columns of C. If the j-th element of the censor vector is TRUE, the corresponding column in C is kept, otherwise it is removed from the possible regressors.

The first censor vector simply removes the redundant columns (since the connectivity from A to B is the same as the connectivity of B to A) and the self-correlations:

# Keep only the strict upper triangle (ROI i < ROI j): drops self-connections
# and the duplicate of each symmetric pair.
censor <- outer(power2011$ROI, 
                power2011$ROI, 
                function(x, y) {x < y}) %>% as.vector()

The second censor vector removes unlikely functional connections: Those with a partial correlation value \(|r| < 0.05|\).

censor2 <- colMeans(C) %>% abs() > 0.05

Now, we combine the censor vectors in a tibble that contains all of the relevant information about each column in C.

# Per-connection metadata table: column index into C, network-pair labels,
# ROI-pair label, and both censor flags; sorted by network pair.
# NOTE(review): `order` shadows base::order for the rest of this document.
order <- tibble(index = 1:length(nets), 
                network = nets, 
                network_names = netnames,
                connection = cols, 
                censor=censor,
                censor2 = censor2)
order %<>% arrange(network)

And we remove all entries for which a censor vector is FALSE (we also create a grouping factor G, in case in the future we want to use Group Lasso)

# Column indices that survive every filter: upper triangle only (censor),
# mean |r| > 0.05 (censor2), and both endpoint networks in the NOI set.
I <- order %>%
  filter(censor == TRUE) %>%
  filter(censor2 == TRUE) %>%
  filter(network_names %in% COI) %>%
  dplyr::select(index) 

# Network-pair label per connection, for (future) group lasso.
# NOTE(review): G filters on censor but not censor2, unlike I — confirm the
# asymmetry is intended.
G <- order %>%
  filter(censor == TRUE) %>%
  filter(network_names %in% COI) %>%
  dplyr::select(network) 
# G is the real grouping factor for Lasso!

As a last step, we create the “real” regressor matrix \(X\), which is the proper subset of \(C\) after removing all of the censored columns.

X <- C[,as_vector(I)]

Load the Dependent Variable \(Y\)

Now we need to load the dependent variable. In this case, it is a binary variable that determines which strategy model best fits the behavioral data of an individual, whether it is the “memory” strategy (\(Y = 1\)) or the “procedural” strategy (\(Y = 2\)).

# Load the behavioral model fits: one row per HCP subject, with the label of
# the best-fitting strategy model, sorted by subject id.
dvs <- read_csv("../actr-models/model_output/MODELLogLikelihood.csv",
                col_types = cols(
                  .default = col_double(),
                  HCPID = col_character(),
                  best_model = col_character()
                )) %>% 
  dplyr::select(HCPID, best_model) %>%
  arrange(HCPID)

Now we select only the rows of \(X\) and the values of \(Y\) for which we have both rsfMRI and model data:

# Keep only subjects present in both the rsfMRI and the behavioral data.
# NOTE(review): this aligns X rows (dir() order of SUBJS) with Y values
# (dvs arranged by HCPID) by position — confirm both orderings agree, and
# that HCPID contains no duplicates.
subjs_hcp <- paste(SUBJS, "fnca", sep="_")
common <- intersect(subjs_hcp, dvs$HCPID)
keep_X <- subjs_hcp %in% common
keep_Y <- dvs$HCPID %in% common
Y <- dvs$best_model[keep_Y]
X <- X[keep_X, ]

Finally, we transform the dependent variable \(Y\) into a binary numeric variable with values \((0, 1)\), so that we can use logistic regression.

Y <- as.numeric(as.factor(Y)) - 1

Quality and Characteristics of \(X\) and \(Y\)

Let’s do some visualization and analysis of our independent and dependent variables, just to ensure there are no obvious problems.

Collinearity of Connectivity Regressors \(X\)

The regressors \(X\) is certainly multi-collinear; that is a consequence of having a large number of predictors \(p > n\), which, in turn, is one of the reasons why we are using Lasso. Too much collinearity, however, could be really bad and push Lasso towards selecting non-optimal regressors. To gather a sense of how much collinearity we have, we can plot the distribution of correlations among regressors:

# Pairwise correlations among regressors; keep the strict lower triangle so
# each pair is counted once and self-correlations are excluded.
corX <- cor(X)
distCor <- as_vector(corX[lower.tri(corX, diag = F)])
distTibble <- as_tibble(data.frame(R=distCor))

# Histogram of between-regressor correlations.
ggplot(distTibble, aes(x=R)) +
  geom_histogram(col="white", alpha=0.5, binwidth = 0.05) +
  theme_pander() +
  ylab("Number of Correlations") +
  xlab("Correlation Value") +
  ggtitle("Distribution of Correlation Values Between Regressors")

All in all, the collinearity is not that bad—all regressors are correlated at \(|r| < 0.25\), and most of them are correlated at \(|r| < 0.1\), with a peak at \(r = 0\).

Distribution of Classes

And now, let’s visualize the histogram of the dependent variable we are trying to predict:

# Bar chart of the class distribution of the dependent variable.
dependent <- as_tibble(data.frame(strategy=Y))

ggplot(dependent, aes(x = factor(strategy), fill=as.factor(strategy))) +
  geom_bar(col="white", alpha=0.5, width = .5) +
  scale_fill_aaas() +
  xlab("Strategy") +
  ylab("Number of Participants") +
  ggtitle("Distribution of Strategy Use") +
  theme_pander() +
  theme(legend.position = "none")

Because the classes are not equally distributed, and participants are more likely to use the memory strategy (\(Y=0\)) than the procedural one (\(Y = 1\)), we would need to adjust the weights of our Lasso model.

Machine-Learning with Lasso

To analyze the data, we will use Lasso, a statistical learning method based on penalized regression.

Weights

Most of the entries in our \(Y\) vector are coded as “0” (i.e., most participants use the memory strategy), which creates an imbalance. We are going to create an appropriate set of weights so that the two classes are balanced.

# Class-balancing weights: class 0 gets weight mean(Y) (the proportion of
# 1s) and class 1 gets 1 - mean(Y), so both classes carry equal total weight.
# The order of the two assignments matters: assumes 0 < mean(Y) < 1, so the
# first assignment cannot create spurious 1s.
W <- Y
W[W == 0] <- mean(Y)
W[W == 1] <- (1-mean(Y))

Defining the model

We can now define the lasso model. We will use the elastic net approach with \(\alpha = 1\) to generate a pure lasso model. The model will use a binomial (i.e., logistic) regression and will measure the cross-validation error as class misassignment.

# Full lasso path (alpha = 1) with class-balancing weights.
# NOTE(review): type.measure is a cv.glmnet() argument, not a glmnet() one —
# confirm it can be dropped here.
fit <- glmnet(y = Y,
              x = X,
              alpha=1,
              weights = W,
              family = "binomial",
              type.measure = "class",
              standardize = T
)

To choose the optimal value of \(\lambda\) in Lasso, we will examine the cross-validation error during a LOO cross-validation.

# Leave-one-out cross-validation over the lambda path (nfolds = N;
# grouped = F is required by cv.glmnet when folds have a single observation),
# scoring by misclassification rate.
fit.cv <- cv.glmnet(y = Y,
                    x = X,
                    alpha=1,
                    family = "binomial",
                    weights = W,
                    type.measure = "class",
                    standardize=T,
                    nfolds=length(Y),
                    grouped = F
)

Now, let’s look at the cross-validation error profile.

plot(fit.cv)

The profile has the characteristic U-shape or increasing curve, with more error as \(\lambda\) increases. As recommended by Tibshirani, we will select the “lambda 1SE” value, which is the largest \(\lambda\) value that does not differ by more than 1 SE from the \(\lambda\) value that gives us the minimum cross validation error. This guarantees the maximum generalizability.

We can also visualize the profile of the beta weights of each connection for different values of \(\lambda\).

# Coefficient paths across the lasso penalty.
plot(fit, sub="Beta Values for Connectivity")

# Mark the L1 norm of the solution at lambda.1se on the path plot.
L1norm <- sum(abs(fit$beta[,which(fit$lambda==fit.cv$lambda.1se)]))
abline(v=L1norm, lwd=2, lty=2)

And now, plot prettier version

# Tidy view of the CV curve: error +/- 1 SD as a function of lambda, with a
# vertical line at the error-minimizing lambda.
lasso_df <- as_tibble(data.frame(lambda=fit.cv$lambda, 
                                 error=fit.cv$cvm, 
                                 sd=fit.cv$cvsd))

ggplot(lasso_df, aes(x=lambda, y=error)) +
  geom_line(aes(col=error), lwd=2) +
  scale_color_viridis("Error", option = "plasma") +
  geom_ribbon(aes(ymin=error -sd, ymax=error + sd), alpha=0.2,fill="blue") +
  xlab(expression(lambda)) +
  ylab("Cross-Validation Error") +
  ggtitle(expression(paste(bold("Cross Validation Error Across "), lambda))) +
  geom_vline(xintercept = lasso_df$lambda[lasso_df$error==min(lasso_df$error)]) +
  theme_pander() +
  theme(legend.position="right")

The minimum-error value \(\lambda_{min}\) is 0.0313777; the 1SE value \(\lambda_{1SE}\) is 0.0313777.

Examining the Predictive Connectome

Let’s have a better look at the relevant connections that survive the Lass penalty at the value of \(\lambda_{min} + 1 SE\). We start by saving these connections in a tibble, and saving the data on a file for later use.

# Coefficients at lambda.1se, joined with the connection metadata; keep only
# the non-zero ones and save everything for later use.
betas <- fit$beta[, which(fit$lambda==fit.cv$lambda.1se)]
conn_betas <- as_tibble(data.frame(index=I$index, Beta=betas))
connectome <- order %>%
  filter(index %in% I$index) %>%
  inner_join(conn_betas) %>%  # joins by "index" (the default shared column)
  dplyr::select(-censor2) %>%
  filter(Beta != 0) %>%
  arrange(index)

write_csv(connectome, file="strategy_mr.csv")
save(fit, fit.cv, X, Y, order, I, G, file="strategy_mr.RData")

Finally, we can visualize the table of connections

# Render the selected-connection table as styled HTML.
connectome %>%
  xtable() %>%
  kable(digits = 5) %>%
  kable_styling(bootstrap_options = c("striped", "hover"))
index network network_names connection censor Beta
8206 1-1 Sensory/somatomotor Hand-Sensory/somatomotor Hand 22-32 TRUE -1.89086
8207 1-1 Sensory/somatomotor Hand-Sensory/somatomotor Hand 23-32 TRUE 2.30505
9264 1-1 Sensory/somatomotor Hand-Sensory/somatomotor Hand 24-36 TRUE -2.03310
10063 1-1 Sensory/somatomotor Hand-Sensory/somatomotor Hand 31-39 TRUE 2.81201
16959 4-4 Auditory-Auditory 63-65 TRUE 0.53776
18017 4-4 Auditory-Auditory 65-69 TRUE 4.03346
18020 4-4 Auditory-Auditory 68-69 TRUE -0.26776
18261 2-4 Sensory/somatomotor Mouth-Auditory 45-70 TRUE 2.48728
18274 3-4 Cingulo-opercular Task Control-Auditory 58-70 TRUE 0.04146
18282 4-4 Auditory-Auditory 66-70 TRUE -0.11546
21995 -1-5 Uncertain-Default mode 83-84 TRUE 0.68047
24368 5-5 Default mode-Default mode 80-93 TRUE 1.69981
24906 5-5 Default mode-Default mode 90-95 TRUE -3.74493
24907 5-5 Default mode-Default mode 91-95 TRUE -5.51874
26222 5-5 Default mode-Default mode 86-100 TRUE -0.04432
28059 5-5 Default mode-Default mode 75-107 TRUE -1.74162
28323 5-5 Default mode-Default mode 75-108 TRUE -2.93711
31420 -1-5 Uncertain-Default mode 4-120 TRUE 0.37132
31534 5-5 Default mode-Default mode 118-120 TRUE 0.52838
32816 5-5 Default mode-Default mode 80-125 TRUE 4.46703
33124 5-5 Default mode-Default mode 124-126 TRUE -0.50586
34942 5-6 Default mode-Memory retrieval? 94-133 TRUE 1.66036
35205 5-6 Default mode-Memory retrieval? 93-134 TRUE -2.41717
35469 5-6 Default mode-Memory retrieval? 93-135 TRUE -1.37214
36007 5-5 Default mode-Default mode 103-137 TRUE -1.63355
37613 5-7 Default mode-Visual 125-143 TRUE 5.99955
40807 7-7 Visual-Visual 151-155 TRUE -2.08989
41075 7-7 Visual-Visual 155-156 TRUE -3.04218
41328 7-7 Visual-Visual 144-157 TRUE 0.52580
41864 7-7 Visual-Visual 152-159 TRUE 0.89705
42660 7-7 Visual-Visual 156-162 TRUE 1.58602
42927 7-7 Visual-Visual 159-163 TRUE -11.65413
43186 7-7 Visual-Visual 154-164 TRUE 2.06833
43458 7-7 Visual-Visual 162-165 TRUE 1.08723
44501 7-7 Visual-Visual 149-169 TRUE -0.83859
44514 7-7 Visual-Visual 162-169 TRUE -3.29812
45566 7-7 Visual-Visual 158-173 TRUE 1.09878
45810 8-11 Fronto-parietal Task Control-Ventral attention 138-174 TRUE 2.70591
47700 8-8 Fronto-parietal Task Control-Fronto-parietal Task Control 180-181 TRUE -1.06966
48220 -1-7 Uncertain-Visual 172-183 TRUE 0.97763
48759 -1–1 Uncertain-Uncertain 183-185 TRUE 0.31034
49015 8-8 Fronto-parietal Task Control-Fronto-parietal Task Control 175-186 TRUE -1.74089
50082 8-8 Fronto-parietal Task Control-Fronto-parietal Task Control 186-190 TRUE -4.20237
51142 8-8 Fronto-parietal Task Control-Fronto-parietal Task Control 190-194 TRUE -1.14423
52464 8-8 Fronto-parietal Task Control-Fronto-parietal Task Control 192-199 TRUE -2.95918
53356 1-9 Sensory/somatomotor Hand-Salience 28-203 TRUE 1.67191
53422 5-9 Default mode-Salience 94-203 TRUE 2.93233
54042 8-9 Fronto-parietal Task Control-Salience 186-205 TRUE -4.51978
54221 5-9 Default mode-Salience 101-206 TRUE -1.41648
55120 9-9 Salience-Salience 208-209 TRUE 6.91535
55500 3-9 Cingulo-opercular Task Control-Salience 60-211 TRUE 3.23332
55649 9-9 Salience-Salience 209-211 TRUE -1.12886
56708 9-9 Salience-Salience 212-215 TRUE -0.09215
57240 9-9 Salience-Salience 216-217 TRUE -0.91650
57770 9-9 Salience-Salience 218-219 TRUE -0.45811
61041 3-10 Cingulo-opercular Task Control-Subcortical 57-232 TRUE -0.28020
61745 10-10 Subcortical-Subcortical 233-234 TRUE -0.15582
62630 4-11 Auditory-Ventral attention 62-238 TRUE -2.21331
63756 -1-11 Uncertain-Ventral attention 132-242 TRUE -0.75419
64071 -1-13 Uncertain-Cerebellar 183-243 TRUE -4.11193
66780 -1-12 Uncertain-Dorsal attention 252-253 TRUE 3.47463
67242 1-8 Sensory/somatomotor Hand-Fronto-parietal Task Control 186-255 TRUE -0.60621
67260 1-9 Sensory/somatomotor Hand-Salience 204-255 TRUE -0.26895
67836 12-12 Dorsal attention-Dorsal attention 252-257 TRUE -0.14686
67838 -1-12 Uncertain-Dorsal attention 254-257 TRUE 0.00521
67873 1-12 Sensory/somatomotor Hand-Dorsal attention 25-258 TRUE 1.37378
68814 8-12 Fronto-parietal Task Control-Dorsal attention 174-261 TRUE -2.42184
69218 3-12 Cingulo-opercular Task Control-Dorsal attention 50-263 TRUE -2.01954
69461 1-12 Sensory/somatomotor Hand-Dorsal attention 29-264 TRUE 0.33782

And we can do some statistics. For example, which networks do these regions and connections belong to? Are they different from what would be expected from a random sample of connections from the connectome?

# Numeric ROI id before the first "-" in a "from-to" connection label.
region_from <- function(s) {
  parts <- strsplit(s, "-", fixed = TRUE)[[1]]
  as.numeric(parts[1])
}
# Numeric ROI id after the first "-" (second split field).
region_to <- function(s) {
  parts <- strsplit(s, "-", fixed = TRUE)[[1]]
  as.numeric(parts[2])
}

# Vectorized forms for use on whole columns.
vregion_from <- Vectorize(region_from)
vregion_to <- Vectorize(region_to)

# ROI ids appearing in at least one selected connection (either endpoint).
lROIs <- unique(c(vregion_from(connectome$connection),
                  vregion_to(connectome$connection)))

# Proportion of selected ROIs per network.
# NOTE(review): power2011[lROIs,] indexes by row position — this assumes ROI
# ids equal row numbers (1..264 in order); confirm against the atlas file.
rois <- power2011[lROIs,]
roi_stats <- rois %>%
  group_by(NetworkName, Color) %>%
  summarise(N=length(Color)/length(lROIs)) %>%
  add_column(Source="Lasso")


# Baseline: proportion of ROIs per network over the whole parcellation,
# restricted to the networks of interest.
subsetPower <- filter(power2011,
                      NetworkName %in% NOI)

noi_stats <- subsetPower %>%
  group_by(NetworkName, Color) %>%
  summarise(N=length(Color)/dim(subsetPower)[1]) %>%
  add_column(Source="Power")

# Stack the two sources for a side-by-side plot.
total_stats <- rbind(roi_stats, noi_stats)

# Pie charts comparing the network composition of the lasso-selected ROIs
# against the full Power parcellation (labels via ggrepel).
# Fix: corrected the typo "Distriution" in the user-facing title.
ggplot(total_stats, aes(x="", y=N, fill=NetworkName)) +
  geom_bar(stat = "identity", col="white", width=1) +
  facet_grid(~Source, labeller = label_both) +
  coord_polar("y", start=0) +
  scale_y_continuous(labels = scales::percent_format(accuracy = 2L)) +
  #scale_fill_viridis(discrete = T) +
  scale_fill_ucscgb() +
  ylab("") +
  xlab("") +
  ggtitle("Distribution of Regions") +
  geom_text_repel(aes(label=percent(N, .01)), col="black",
            position=position_stack(vjust=1), size=3)+
  theme_pander() +
  guides(fill=guide_legend(ncol=2)) +
  theme(legend.position = "bottom")

There is no difference in the distribution of ROIs:

# Chi-squared goodness-of-fit: observed ROI counts vs. the parcellation's
# network proportions.
chisq.test(roi_stats$N*length(lROIs), p = noi_stats$N)
## 
##  Chi-squared test for given probabilities
## 
## data:  roi_stats$N * length(lROIs)
## X-squared = 13.267, df = 13, p-value = 0.4274

And now, let’s look at the sparsity

# First "-"-delimited field of a network-pair label.
# NOTE(review): network names that themselves contain hyphens (e.g.
# "Cingulo-opercular Task Control") are truncated at their first hyphen —
# confirm these fields are only used cosmetically.
net_from <- function(s) {
  fields <- strsplit(s, "-", fixed = TRUE)[[1]]
  as.character(fields[1])
}
# Second "-"-delimited field of a network-pair label (same caveat).
net_to <- function(s) {
  fields <- strsplit(s, "-", fixed = TRUE)[[1]]
  as.character(fields[2])
}

# Vectorized forms for use on whole columns.
vnet_from <- Vectorize(net_from)
vnet_to <- Vectorize(net_to)

# Classify a network-pair label "A-B" as "Within" (A == B) or "Between".
#
# Fix: the original compared the first two "-"-split fields, which
# misclassifies within-network pairs whose names contain hyphens — e.g.
# "Fronto-parietal Task Control-Fronto-parietal Task Control" split into
# "Fronto" vs. "parietal Task Control" and was labeled "Between".
# A label is within-network iff it has the form "H-H", i.e. its first half
# equals its second half around the central hyphen.
connectivity <- function(s) {
  n <- nchar(s)
  if (n %% 2 == 1) {  # "H-H" has odd length: 2 * nchar(H) + 1
    half <- substr(s, 1, (n - 1) / 2)
    if (identical(paste(half, half, sep = "-"), s)) {
      return("Within")
    }
  }
  "Between"
}

# Vectorized form for use on whole columns.
vconnectivity <- Vectorize(connectivity)

# All candidate connections (upper triangle, networks of interest), labeled
# with their endpoint networks and Within/Between type.
coi <- order %>%
  filter(censor == TRUE) %>%
  filter(network_names %in% COI) 

coi$from <- vnet_from(coi$network_names)
coi$to <- vnet_to(coi$network_names)
coi$connection_type <- vconnectivity(coi$network_names)

# Count and proportion of Within vs. Between connections among candidates.
coi_stats <- coi %>% 
  group_by(connection_type) %>% 
  summarise(N=length(index), P=length(index)/dim(coi)[1]) %>%
  add_column(Source = "Power et al. (2011)")
  
  

# Same Within/Between breakdown for the lasso-selected connections.
connectome$connection_type <- vconnectivity(connectome$network_names)
connectome_stats <- connectome %>%
  group_by(connection_type) %>% 
  summarise(N=length(index), P=length(index)/dim(connectome)[1]) %>%
  add_column(Source = "Lasso Analysis")
  

# Stack the two sources for a side-by-side plot.
total_stats2 <- rbind(connectome_stats, coi_stats)

# Pie charts comparing Within/Between proportions in the selected
# connections against the candidate pool.
# Fix: corrected the typo "Distribuction" in the user-facing title.
ggplot(total_stats2, aes(x="", y=P, fill=connection_type)) +
  geom_bar(stat = "identity", col="grey", width=1) +
  facet_grid(~Source, labeller = label_both) +
  coord_polar("y", start=0) +
  scale_y_continuous(labels = scales::percent_format(accuracy = 2L)) +
  scale_fill_jama() +
  scale_color_jama() +
  ylab("") +
  xlab("") +
  ggtitle("Distribution of Connectivity\n(Within/Between Networks)") +
  geom_text_repel(aes(label=percent(P, .1)), col="white",
            position=position_stack(vjust=1), size=3)+
  theme_pander() +
  theme(legend.position = "bottom")

The difference in the two distributions is significant.

# Chi-squared: selected Within/Between counts vs. candidate proportions.
chisq.test(connectome_stats$N, p=coi_stats$P)
## 
##  Chi-squared test for given probabilities
## 
## data:  connectome_stats$N
## X-squared = 152.61, df = 1, p-value < 2.2e-16

Connectivity by Region

Finally, we can look at how many connections each region belongs to:

# Count how many selected connections each ROI participates in (an ROI is
# counted once per connection endpoint).
rois <- c(vregion_from(connectome$connection), vregion_to(connectome$connection))
rois_t <- tibble(roi = rois)
rois_t %>%
  group_by(roi) %>%
  summarize(N = length(roi)) %>%
  arrange(N) -> rois_dist

# Render the per-ROI counts as a styled table.
rois_dist %>%
  xtable() %>%
  kable(digits = 5) %>%
  kable_styling(bootstrap_options = c("striped", "hover"))
roi N
4 1
22 1
23 1
24 1
25 1
28 1
29 1
31 1
36 1
39 1
45 1
50 1
57 1
58 1
60 1
62 1
63 1
66 1
68 1
83 1
84 1
86 1
90 1
91 1
100 1
101 1
103 1
107 1
108 1
118 1
124 1
126 1
132 1
133 1
134 1
135 1
137 1
138 1
143 1
144 1
149 1
151 1
152 1
154 1
157 1
158 1
163 1
164 1
165 1
172 1
173 1
175 1
180 1
181 1
185 1
192 1
194 1
199 1
204 1
205 1
206 1
208 1
212 1
215 1
216 1
217 1
218 1
219 1
232 1
233 1
234 1
238 1
242 1
243 1
253 1
254 1
258 1
261 1
263 1
264 1
32 2
65 2
69 2
75 2
80 2
94 2
95 2
120 2
125 2
155 2
156 2
159 2
169 2
174 2
190 2
203 2
209 2
211 2
252 2
255 2
257 2
70 3
93 3
162 3
183 3
186 4

And visualize

# Histogram of connections-per-ROI, annotated with each bin's count.
ggplot(rois_dist, aes(x=N)) +
  geom_histogram(alpha=0.5, col="white", 
                 binwidth = 1) +
  ggtitle("Distribution of Number of Connections per ROI") +
  stat_bin(binwidth= 1, 
           geom="text", 
           aes(label = paste("N =", ..count..)), 
           vjust = -1) +
  ylim(0, 100) +
  theme_pander()

Cross Validated Predictions

How well is the model doing? To investigate this, we can hand-craft a Leave-One Out regression model and save the predicted values of the dependent (strategy) variable as well as the recorded beta weights.

# Hand-rolled LOO-CV: refit an OLS model on the lasso-selected columns
# (betas != 0) with subject i held out, saving the held-out prediction and
# the per-fold slope coefficients.
# NOTE(review): the columns were selected on the full sample, so this
# estimate is still subject to selection leakage (see the nested CV below).
dfX <- data.frame(cbind(Y, X[,betas != 0]))
numP <- ncol(X[, betas != 0])  
numO <- length(Y)
names(dfX) <- c("Y", paste("X", 1:numP, sep=""))

Yp <- rep(0, numO)  # Vector of zeros the size of Y 
Xe <- matrix(rep(0, numP * numO), 
             ncol = numP)  # Matrix of zeros the dimensions of X

for (i in seq(1, length(Y))) {
  # Fit on everyone except subject i.
  subdfX <- dfX[-i,]
  lmod<-lm(Y ~ . + 1, as.data.frame(subdfX))
  
  # Predict the held-out subject from its own regressor values.
  yp <- predict(object=lmod, 
                newdata=dfX[i, 2:(numP + 1)])
  Yp[i] <- yp
  # Save the fold's slopes (coefficient 1 is the intercept).
  Xe[i,] <- lmod$coefficients[2:(numP + 1)]
}

Predicted vs. Observed

Now, let’s do a real predicted vs. observed graph:

# Pair observed and LOO-predicted strategy; discretize predictions at 0.5.
wcomparison <- tibble(Observed = Y,
                      Predicted = Yp,
                      DiscretePredicted = ifelse(Yp < 0.5, 0, 1))
              
wcomparison %<>% mutate(Accuracy = ifelse(DiscretePredicted == Observed,
                                          "Correct", 
                                          "Misclassified"))

# Correlation, truncated (not rounded) to two decimals for the annotation.
rval <- floor(100*cor(Y, Yp))/100

# Predicted-vs-observed scatter (jittered vertically for visibility), with
# the identity line, the correlation annotation, and marginal densities.
p <- ggplot(wcomparison, aes(x=Predicted, y=Observed, 
                             col=Accuracy)) +
  geom_point(size=4, alpha=0.6, 
             position= position_jitter(height = 0.02)) +
  geom_abline(intercept = 0, slope = 1, 
              col="red",
              linetype="dashed") +
  scale_color_d3() +
  theme_pander() +

  theme(legend.position = "right") +
  guides(col=guide_legend("Classification")) +
  coord_fixed(xlim=c(0, 1), ylim=c(0, 1)) +
  annotate("text", x=0.3, y=0.7,
           label=paste("r(",
                       length(Y),
                       ") = ",
                       rval,
                       ", p < 0.001",
                       sep="")) +
  ylab("Observed Strategy") +
  xlab("Predicted Strategy") +
  ggtitle("Decision Strategy:\nCross-Validation") +
  theme(legend.position = "bottom")
  
# Add marginal density plots on both axes (ggExtra).
ggMarginal(p, 
           fill="grey", 
           alpha=0.75,
           type="density", #bins=13, 
           col="darkgrey",
           margins = "both")

ROC

And now, ROC to get the accuracy.

# Discrete ROC at the 0.5 threshold (pROC).
wcomparison %<>% mutate(ROCPrediction = if_else(Predicted < 0.5, 0, 1))

rocobj <- roc(wcomparison$Observed, wcomparison$ROCPrediction)

# ROC curve with the chance diagonal for reference.
g <- ggroc(rocobj, col="red") +
  geom_point(aes(y=rocobj$sensitivities, x=rocobj$specificities), col="red", size=4, alpha=.5) +
  ggtitle("ROC Curve for Model") +
  xlab("Specificity (FPR)") + ylab("Sensitivity (TPR)") + 
  geom_segment(aes(x = 1, xend = 0, y = 0, yend = 1), color="grey", linetype="dashed") +
  theme_pander()

g

The final accuracy of the prediction model is 0.9290805

ROC Curve By Sliding Threshold

To assess the robustness of our threshold, we can plot a ROC curve with specifities and sensitivities for sliding values of the prediction threshold.

# Sweep the classification threshold from 0 to 1, computing specificity
# (TNR) and sensitivity (TPR) at each step. (Growing `curve` with rbind is
# an anti-pattern in general but harmless over 101 iterations.)
curve <- NULL

for (threshold in seq(0, 1, 0.01)) {
  subthreshold <- wcomparison %>%
    # Clamp predictions into (0, 1] so every value is classifiable.
    # NOTE(review): 1e-204 is a sentinel that keeps zero predictions
    # strictly positive at threshold 0 — confirm this is the intent.
    mutate(Prediction = ifelse(Predicted > 1, 1, Predicted)) %>%
    mutate(Prediction = ifelse(Prediction <= 0, 1e-204, Prediction)) %>%
    mutate(Prediction = ifelse(Prediction <= threshold, 0, 1)) %>%
    mutate(Accuracy = ifelse(Prediction == Observed, 1, 0)) %>%
    group_by(Observed) %>%
    summarise(Accuracy = mean(Accuracy))
  
  # Accuracy within class 0 = true negative rate.
  tnr <- subthreshold %>% 
    filter(Observed == 0) %>% 
    dplyr::select(Accuracy) %>%
    as.numeric()
  
  # Accuracy within class 1 = true positive rate.
  tpr <- subthreshold %>% 
    filter(Observed == 1) %>% 
    dplyr::select(Accuracy) %>%
    as.numeric()
  
  partial <- tibble(Threshold = threshold,
                    TNR = tnr,
                    TPR = tpr)
  if (is.null(curve)) {
    curve <- partial
  } else {
    curve <- rbind(curve, partial)
  }
}

And now, we can visualize the discrete ROC:

# Discrete ROC across thresholds; x-axis reversed so chance runs corner to
# corner along the dashed diagonal.
ggplot(arrange(curve, TPR), aes(x=TNR, y=TPR)) + 
  geom_point(size=2, col="red", alpha=0.5) + 
  geom_line(col="red") + 
  ylab("Sensitivity (True Positive Rate)") +
  xlab("Specificity (True Negative Rate)") +
  scale_x_reverse() +
  # ylim(0, 1) +
  # xlim(1, 0) +
  ggtitle("ROC Curve for Different Thresholds") +
  geom_abline(slope=1, intercept = 1, col="grey", linetype = "dashed") +
  theme_pander()

Stability of Estimated Beta Weights

And now, let’s visualize the beta weights of the connections

# Name the columns of the cross-validated beta matrix and reshape to long
# format (one row per fold x connection) for plotting.
# Fix: the original called paste(..., paste="") — "paste" is not an argument
# of paste(), so "" was silently pasted as an extra element, producing names
# like "Connection 1 " with a trailing space. The intended argument is sep.
colnames(Xe) <- paste("Connection", seq_len(nrow(connectome)), sep = "")
wconnections <- as_tibble(Xe)
lconnections <- pivot_longer(wconnections, cols=colnames(Xe), 
                             names_to="Connection", values_to = "Beta")

# Sort connections by their full-model coefficient so axis labels below
# match the reordered x positions.
connectome <- connectome %>% arrange(Beta)

# Per-connection distribution of coefficients across CV folds: jittered
# points, mean +/- SD markers, and bootstrap CIs; flipped for readability.
ggplot(lconnections, aes(x = reorder(Connection, Beta), y = Beta)) +
  geom_point(aes(col=Beta), alpha=.5, 
             size=2,
             position = position_jitter(height = 0, width = 0.3)) +
  stat_summary(fun.data = "mean_sdl", geom="point", fill="black", alpha=1, size=1) +
  scale_color_gradient2(low = "dodgerblue",
                        mid = "wheat",
                        high = "red2",
                        midpoint = 0) +
  # Relabel ticks with network-pair and ROI-pair names.
  scale_x_discrete(labels = 
                     paste(connectome$network_names, 
                           " (", 
                           connectome$connection,
                           ")", sep="")) +
  ggtitle("Connection Weights\nAcross Cross-Validation") +
  ylab(expression(paste(beta, " value"))) +
  xlab("Connection") +
  geom_hline(yintercept = 0, col="grey") +
  stat_summary(fun.data = "mean_cl_boot", 
               col="black", geom="errorbar", width=1) +
  #scale_color_viridis(option="plasma", begin=0.2, end=0.9) +
  
  theme_pander() +
  theme(axis.text.y = element_text(angle=0, hjust=1),
        legend.position = "NA") +
  #ylim(-3, 3) +
  coord_flip()

Testing the validity of the Lasso model

Here, we will examine the quality of our Lasso model by doing a series of tests.

Ablation test

In the ablation test, we remove all the connections with significant beta values, and check whether the results are still significant.

# Ablation: drop the lasso-selected columns and refit over the same lambda
# path, to check how much predictive signal remains without them.
XX <- X[, conn_betas$Beta == 0]

fit_wo <- glmnet(y = Y,
                 x = XX,
                 alpha=1,
                 lambda = fit$lambda,
                 family = "binomial",
                 type.measure = "class",
                 weights = W,
                 standardize = T
)

# LOO cross-validation of the ablated model on the same lambda grid.
fit_wo.cv <- cv.glmnet(y = Y,
                       x = XX,
                       alpha=1,
                       weights = W,
                       lambda = fit$lambda,
                       standardize=T,
                       type.measure = "class",
                       family = "binomial",
                       grouped=F,
                       nfolds=length(Y)
)

The model does converge, but its overall classification error is much higher.

# Coefficient paths for the ablated model, with the L1 norm at lambda.1se.
plot(fit_wo, sub="Beta Values for Connectivity")

L1norm <- sum(abs(fit_wo$beta[,which(fit_wo$lambda==fit_wo.cv$lambda.1se)]))
abline(v=L1norm, lwd=2, lty=2)

It is useful to plot the two \(\lambda\)-curves (with and without the relevant connections) on the same plot.

# Overlay the CV-error curves of the full and ablated models, with the
# full model's lambda.1se marked.
lasso_df_wo <- tibble(lambda=fit_wo.cv$lambda, 
                   error=fit_wo.cv$cvm, 
                   sd=fit_wo.cv$cvsd)



lasso_df$Model <- "Full Model"
lasso_df_wo$Model <- "Without the Selected Connections"

lasso_uber <- rbind(lasso_df, lasso_df_wo)

ggplot(lasso_uber, aes(x = lambda, y = error, fill=Model)) +
  scale_color_d3() +
  scale_fill_d3()+
  geom_ribbon(aes(ymin = error - sd, 
                  ymax = error + sd), 
              alpha = 0.5,
              #fill="blue"
              ) +
  geom_line(aes(col=Model), lwd=2) +
  xlab(expression(lambda)) +
  ylab("Cross-Validation Error") +
  ggtitle(expression(paste(bold("Cross Validation Error Across "), lambda))) +
  geom_vline(xintercept = fit.cv$lambda.1se,
             linetype="dashed") +
  theme_pander() +
  theme(legend.position="bottom")

Variance Inflation Factor

Then, we examine the Variance Inflation Factor (VIF). To calculate the VIF, we need to first create a linear model of the factor effects:

# OLS model on the lasso-selected connections, used below to compute VIFs.
dfX <- data.frame(cbind(Y, X[,betas != 0]))
mod<-lm(Y ~ . + 1, as.data.frame(dfX))

We can now calculate the VIF and turn the results into a tibble:

# Variance Inflation Factors (car::vif), as a tibble for plotting.
vifs <- vif(mod)
vifsT <- tibble(VIF = vifs)

And, finally, we can plot a histogram of the distribution of VIF values. VIF values < 10 are considered non-collinear; VIF values < 5 are great. All of our factors have VIF values that are much smaller than 5, which implies that they are as close to an orthogonal basis set as possible.

# Histogram of the VIF values across predictors.
ggplot(vifsT, aes( x =VIF)) +
  geom_histogram(col="white", binwidth = 0.1, fill="blue", alpha=0.4) +
  theme_pander() +
  xlab("VIF Value") +
  ylab("Number of Predictors") +
  ggtitle("Distribution of Variance Inflation Factors")

Nested Cross-Validation

Standard LOOV cross-validation suffers from data leakage: Since the same data is used for both validation (hyper-parameter fitting, in this case, the \(\lambda\) parameter) and testing, success on testing is biased by the fact that validation was performed on the same sample.

The canonical way to handle this problem is to have separate train-validate-test groups. This, however, runs into the problem of dividing the data properly and having a reduced sample.

An alternative solution is to do nested cross-validation. In this case, the data is analyzed with two LOOV loops: at every turn, one subject is left out as a test subject, and the remaining N-1 are used for training and validation. This group is then again split, leaving one participant out for validation and the remaining N-2 for training. The inner loop is performed until the validation is complete, at which point we use the hyper-parameters that guarantee best generalization to test the model on the original, left-out participant. The process is repeated for every participant.

Note that a different model (and a different profile of \(\lambda\)) is generated for every participant. To obtain the best model, we first select the \(\lambda_{1SE}\) value. If there are no regressors left (i.e., \(N(\beta \neq 0) = 0\)), we select \(\lambda_{min}\). If we still have no regressors, we select the largest value of \(\lambda\) for which at least one regressor is left \(\neq 0\).

# Nested LOO cross-validation: the outer loop holds out subject i for
# testing; the inner cv.glmnet() loop (LOO over the remaining N-1) selects
# the penalty lambda.
N <- length(Y)
P <- ncol(X)
betas <- matrix(rep(0, P * N), nrow = N)
Yp <- rep(0, N)
minF = 1  # minimum number of non-zero features required per fold
# Fisher z-transform of the partial correlations.
# NOTE(review): this permanently overwrites the global X — any chunk run
# after this one sees the transformed values.
X <- atanh(X)
#dfX <- as.data.frame(cbind(Y, X))
for (i in 1:N) {
  # Training set: everyone except the held-out subject i.
  Ytrain <- Y[-i]
  Xtrain <- X[-i,]
  Wtrain <- W[-i]
  # fit <- ncvreg
  fit <- glmnet(y = Ytrain,
                x = Xtrain,
                weights = Wtrain,
                alpha = 1,
                family = "binomial",
                type.measure ="class",
                standardize = T
  )
  
  # Inner LOO loop over the N-1 training subjects.
  fit.cv <- cv.glmnet(y = Ytrain,
                      x = Xtrain,
                      alpha=1,
                      weights=Wtrain,
                      #penalty="SCAD",
                      family = "binomial",
                      type.measure = "class",
                      standardize=T,
                      grouped=F,
                      #nfolds=5
                      nfolds=length(Ytrain)
  )
  
  # Lambda selection: prefer lambda.1se; fall back to lambda.min; if even
  # that leaves fewer than minF features, take the largest lambda that
  # keeps at least minF non-zero coefficients.
  lambda <- fit.cv$lambda.min
  nzero <- fit.cv$nzero[which(fit.cv$lambda == fit.cv$lambda.min)]
  
  if (fit.cv$nzero[which(fit.cv$lambda == fit.cv$lambda.1se)] > 0) {
    lambda <- fit.cv$lambda.1se
    nzero <- fit.cv$nzero[which(fit.cv$lambda == fit.cv$lambda.1se)]
  }
  
  if (nzero < minF) {
    # If no features, select a less-generalizable lambda
    lambda <- fit.cv$lambda[which(fit.cv$nzero >= minF)[1]]
  } 
  
  B <- fit$beta[,which(fit$lambda==lambda)]
  #B <- fit$beta[,which(fit$lambda==fit$lambda[60])]
  #print(B)
  #print(fit.cv$lambda.min)
  #plot(fit.cv)
  if (length(B[B!=0])) {  # at least one non-zero coefficient
    #print(c(i, length(B[B!=0])))
    dfX <- data.frame(cbind(Y, X[, B != 0]))
    lmod<-lm(Y ~ . + 1, data=dfX[-i,])
    #print(lmod)
    #Xtest <- dfX[i,-1]
    # NOTE(review): the prediction combines the refit lm intercept with the
    # glmnet slopes (B) rather than the lm slopes — confirm this is intended.
    yp <- lmod$coefficients[1] + sum(B*X[i,])
  } else {
    # No surviving features: fall back to the training-set base rate.
    yp <- mean(Ytrain)
  }
  betas[i,] <- B
  Yp[i] <- yp
}

One of the problems of nested CV is that a different model is tested for each participant. To get a sense of the most predictive regressors, we can simply average across them:

# Average the LASSO coefficients across the N per-subject models to get a
# single group-level profile of the most predictive connections.
mean_betas <- colMeans(betas)
conn_betas <- as_tibble(data.frame(index = I$index, Beta = mean_betas))

connectome_nested <- order %>%
  filter(index %in% I$index) %>%
  inner_join(conn_betas) %>%
  dplyr::select(-censor2) %>%
  filter(Beta != 0) %>%
  arrange(index)

# NOTE(review): "nestd" looks like a typo for "nested", but downstream code
# may read this exact path -- confirm before renaming the file.
write_csv(connectome_nested, file="strategy_mr_nestd.csv")
#load("./rlvsibl_analysis_lasso_mr_nested_followup_cache/cache_chunk62.RData")

Predicted vs. Observed

Now, let’s do a real predicted vs. observed graph:

# Observed vs. predicted table; predictions are also discretized at 0.5.
wcomparison <- tibble(
  Observed = Y,
  Predicted = Yp,
  DiscretePredicted = ifelse(Yp < 0.5, 0, 1)
)

wcomparison <- wcomparison %>%
  mutate(Accuracy = ifelse(DiscretePredicted == Observed,
                           "Correct",
                           "Misclassified"))

# Correlation, truncated (not rounded) to two decimals for display.
rval <- floor(100 * cor(Y, Yp)) / 100

# Predicted vs. observed scatter (jittered), colored by classification
# outcome, with marginal densities. The redundant legend.position = "right"
# from the original was removed: it was overridden by "bottom" below.
p <- ggplot(wcomparison, aes(x = Predicted, y = Observed,
                             col = Accuracy)) +
  geom_point(size = 4, alpha = 0.6,
             position = position_jitter(height = 0.02)) +
  # Identity line: perfect prediction.
  geom_abline(intercept = 0, slope = 1,
              col = "red",
              linetype = "dashed") +
  scale_color_d3() +
  theme_pander() +
  guides(col = guide_legend("Classification")) +
  coord_fixed(xlim = c(0, 1), ylim = c(0, 1)) +
  annotate("text", x = 0.3, y = 0.7,
           label = paste("r(",
                         length(Y),
                         ") = ",
                         rval,
                         ", p < 0.001",
                         sep = "")) +
  ylab("Observed Strategy") +
  xlab("Predicted Strategy") +
  ggtitle("Decision Strategy:\nCross-Validation") +
  theme(legend.position = "bottom")

ggMarginal(p,
           fill = "grey",
           alpha = 0.75,
           type = "density",
           col = "darkgrey",
           margins = "both")

ROC

And now, ROC to get the accuracy.

# ROC computed on the discretized predictions (threshold 0.5).
wcomparison <- wcomparison %>%
  mutate(ROCPrediction = if_else(Predicted < 0.5, 0, 1))

rocobj <- roc(wcomparison$Observed, wcomparison$ROCPrediction)

roc_plot <- ggroc(rocobj, col = "red") +
  geom_point(aes(y = rocobj$sensitivities, x = rocobj$specificities),
             col = "red", size = 4, alpha = .5) +
  ggtitle("ROC Curve for Model") +
  xlab("Specificity (FPR)") +
  ylab("Sensitivity (TPR)") +
  # Chance-level diagonal.
  geom_segment(aes(x = 1, xend = 0, y = 0, yend = 1),
               color = "grey", linetype = "dashed") +
  theme_pander()

g <- roc_plot

g

The final accuracy of the prediction model is 0.9290805

ROC Curve By Sliding Threshold

To assess the robustness of our threshold, we can plot a ROC curve with specificities and sensitivities for sliding values of the prediction threshold.

# Discrete ROC obtained by sliding the classification threshold from 0 to 1.
# Rows are collected in a preallocated list and bound once at the end,
# instead of growing `curve` with rbind on every iteration (O(n^2)).
thresholds <- seq(0, 1, 0.01)
curve_rows <- vector("list", length(thresholds))

for (k in seq_along(thresholds)) {
  threshold <- thresholds[k]
  subthreshold <- wcomparison %>%
    # Clamp predictions into (0, 1]; values <= 0 are nudged to a tiny
    # positive number so that threshold = 0 classifies them as 1.
    mutate(Prediction = ifelse(Predicted > 1, 1, Predicted)) %>%
    mutate(Prediction = ifelse(Prediction <= 0, 1e-204, Prediction)) %>%
    mutate(Prediction = ifelse(Prediction <= threshold, 0, 1)) %>%
    mutate(Accuracy = ifelse(Prediction == Observed, 1, 0)) %>%
    group_by(Observed) %>%
    summarise(Accuracy = mean(Accuracy))

  # Accuracy within Observed == 0 is the true negative rate; within
  # Observed == 1, the true positive rate.
  tnr <- subthreshold %>%
    filter(Observed == 0) %>%
    pull(Accuracy)

  tpr <- subthreshold %>%
    filter(Observed == 1) %>%
    pull(Accuracy)

  curve_rows[[k]] <- tibble(Threshold = threshold,
                            TNR = tnr,
                            TPR = tpr)
}

curve <- bind_rows(curve_rows)

And now, we can visualize the discrete ROC:

# Discrete ROC: one point per threshold; specificity axis reversed so the
# curve reads like a conventional ROC.
discrete_roc <- ggplot(arrange(curve, TPR), aes(x = TNR, y = TPR)) +
  geom_point(size = 2, col = "red", alpha = 0.5) +
  geom_line(col = "red") +
  ylab("Sensitivity (True Positive Rate)") +
  xlab("Specificity (True Negative Rate)") +
  scale_x_reverse() +
  ggtitle("ROC Curve for Different Thresholds") +
  geom_abline(slope = 1, intercept = 1, col = "grey", linetype = "dashed") +
  theme_pander()
discrete_roc


Quantitative Connectivity Analysis

We can look at whether there are differences in connectivity between declarative individuals vs. procedural individuals using quantitative methods

Beta Distribution by Region

# skip this block
# Stacked proportions of declarative (Beta >= 0) vs. procedural (Beta < 0)
# connections, faceted by network pair.
connectome %>%
  #filter(connection_type != 'Within') %>%
  mutate(Group = if_else(Beta >= 0, 'Declarative', 'Procedural')) %>%
  group_by(network_names, connection_type, Group) %>%
  summarise(n= n()) %>%
  # After summarise the data are still grouped by (network_names,
  # connection_type), so freq is relative to that pair.
  mutate(freq = n / sum(n)) %>%
  #arrange(desc(freq, Group, network_names)) %>%

  ggplot(aes(x=network_names, y=freq)) +
  geom_bar(aes(fill = Group), stat = "identity", width=.4, position = position_stack()) +
  facet_grid(network_names~.) +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))
# only positive betas (declarative individuals)
# only positive betas (declarative individuals)

# Two candidate thresholds for splitting connections by Beta were considered:
# (1) the 95% CI of the mean Beta -- computed below but then overridden.
# using 95% confidence interval
lowerBeta <- t.test(connectome$Beta, conf.level = .95)$conf.int[1]
upperBeta <- t.test(connectome$Beta, conf.level = .95)$conf.int[2]

# (2) zero -- this is the option actually in effect, since it overwrites
# the CI bounds just computed above.
# using 0
lowerBeta <- 0
upperBeta <- 0

# Positive Beta = declarative connections; negative Beta = procedural.
connectome_beta_positive <- connectome %>% subset(Beta>upperBeta)
connectome_beta_negative <- connectome %>% subset(Beta<lowerBeta)

# Shared helper: proportion of selected ROIs falling in each network, with
# every atlas network retained (networks with no selected ROI get N = 0).
# The positive and negative branches below were previously copy-pasted.
roi_network_stats <- function(rois, n_rois, source_label) {
  rois %>%
    group_by(NetworkName, Color) %>%
    summarise(N = length(Color) / n_rois) %>%
    # keep all networks in
    right_join(power2011 %>% distinct(NetworkName, Color)) %>%
    add_column(Source = source_label) %>%
    mutate(N = replace_na(N, 0)) %>%
    arrange(NetworkName)
}

# ROIs that appear in at least one positive-beta (declarative) connection.
lROIs_beta_positive <- unique(c(vregion_from(connectome_beta_positive$connection),
                                vregion_to(connectome_beta_positive$connection)))
rois_beta_positive <- power2011[lROIs_beta_positive, ]
roi_stats_beta_positive <- roi_network_stats(rois_beta_positive,
                                             length(lROIs_beta_positive),
                                             "beta_positive")

# Alternative formulation kept from the original for reference:
#power2011 %>%
#  mutate(beta_positive = ROI %in% lROIs_beta_positive) %>%
#  group_by(NetworkName, Color, beta_positive) %>%
#  summarise(N=length(Color)/length(lROIs_beta_positive)) %>%
#  add_column(Source="beta_positive") %>%
#  filter(beta_positive==T)

# ROIs that appear in at least one negative-beta (procedural) connection.
lROIs_beta_negative <- unique(c(vregion_from(connectome_beta_negative$connection),
                                vregion_to(connectome_beta_negative$connection)))
rois_beta_negative <- power2011[lROIs_beta_negative, ]
roi_stats_beta_negative <- roi_network_stats(rois_beta_negative,
                                             length(lROIs_beta_negative),
                                             "beta_negative")
  
# Pie charts of network composition: full atlas (noi_stats) vs. the
# positive- and negative-beta ROI sets. Fixes the "Distriution" typo in
# the original title.
noi_stats %>%
  rbind(roi_stats_beta_positive) %>%
  rbind(roi_stats_beta_negative) %>%
  ggplot(aes(x = "", y = N, fill = NetworkName)) +
  geom_bar(stat = "identity", col = "white", width = 1) +
  facet_grid(~Source, labeller = label_both) +
  # Stacked bar on polar coordinates = pie chart.
  coord_polar("y", start = 0) +
  scale_y_continuous(labels = scales::percent_format(accuracy = 2L)) +
  scale_fill_ucscgb() +
  ylab("") +
  xlab("") +
  ggtitle("Distribution of Regions") +
  theme_pander() +
  guides(fill = guide_legend(ncol = 2)) +
  theme(legend.position = "bottom")

Looking at positive beta networks vs. negative beta networks, there is no difference in ROI distribution

# Positive-beta ROI counts (proportion * number of ROIs) vs. the full-atlas
# network distribution.
chisq.test(roi_stats_beta_positive$N*length(lROIs_beta_positive), p = noi_stats$N, simulate.p.value = T)
## 
##  Chi-squared test for given probabilities with simulated p-value (based
##  on 2000 replicates)
## 
## data:  roi_stats_beta_positive$N * length(lROIs_beta_positive)
## X-squared = 19.537, df = NA, p-value = 0.1069
# Negative-beta ROI counts vs. the full-atlas network distribution.
chisq.test(roi_stats_beta_negative$N*length(lROIs_beta_negative), p = noi_stats$N, simulate.p.value = T)
## 
##  Chi-squared test for given probabilities with simulated p-value (based
##  on 2000 replicates)
## 
## data:  roi_stats_beta_negative$N * length(lROIs_beta_negative)
## X-squared = 18.314, df = NA, p-value = 0.1539
# Direct comparison: positive-beta vs. negative-beta network counts.
chisq.test(x = roi_stats_beta_positive$N*length(lROIs_beta_positive), 
           y = roi_stats_beta_negative$N*length(lROIs_beta_negative), 
           simulate.p.value = T)
## 
##  Pearson's Chi-squared test with simulated p-value (based on 2000
##  replicates)
## 
## data:  roi_stats_beta_positive$N * length(lROIs_beta_positive) and roi_stats_beta_negative$N * length(lROIs_beta_negative)
## X-squared = 53.667, df = NA, p-value = 0.7446
# Same comparison with Fisher's exact test (small expected counts).
fisher.test(x = roi_stats_beta_positive$N*length(lROIs_beta_positive), 
           y = roi_stats_beta_negative$N*length(lROIs_beta_negative))
## 
##  Fisher's Exact Test for Count Data
## 
## data:  roi_stats_beta_positive$N * length(lROIs_beta_positive) and roi_stats_beta_negative$N * length(lROIs_beta_negative)
## p-value = 0.6369
## alternative hypothesis: two.sided

Next, look at the between/within distribution

# Within/Between composition of the positive- and negative-beta connectomes,
# compared against the full connectome (coi_stats). Fixes the
# "Distribuction" typo in the original title.
connectome_stats_beta_positive <- connectome_beta_positive %>%
  group_by(connection_type) %>%
  summarise(N = length(index),
            P = length(index) / dim(connectome_beta_positive)[1]) %>%
  add_column(Source = "beta_positive")

connectome_stats_beta_negative <- connectome_beta_negative %>%
  group_by(connection_type) %>%
  summarise(N = length(index),
            P = length(index) / dim(connectome_beta_negative)[1]) %>%
  add_column(Source = "beta_negative")

coi_stats %>%
  rbind(connectome_stats_beta_positive) %>%
  rbind(connectome_stats_beta_negative) %>%
  ggplot(aes(x = "", y = P, fill = connection_type)) +
  geom_bar(stat = "identity", col = "grey", width = 1) +
  facet_grid(~Source, labeller = label_both) +
  coord_polar("y", start = 0) +
  scale_y_continuous(labels = scales::percent_format(accuracy = 2L)) +
  scale_fill_jama() +
  scale_color_jama() +
  ylab("") +
  xlab("") +
  ggtitle("Distribution of Connectivity\n(Within/Between Networks)") +
  geom_text_repel(aes(label = percent(P, .1)), col = "white",
                  position = position_stack(vjust = 1), size = 3) +
  theme_pander() +
  theme(legend.position = "bottom")

The distribution is different from Power 2011, but there is no difference between declarative vs. procedural network

# Positive-beta within/between counts vs. the full-connectome proportions.
chisq.test(connectome_stats_beta_positive$N, p=coi_stats$P, simulate.p.value = T)
## 
##  Chi-squared test for given probabilities with simulated p-value (based
##  on 2000 replicates)
## 
## data:  connectome_stats_beta_positive$N
## X-squared = 55.863, df = NA, p-value = 0.0004998
# Negative-beta within/between counts vs. the full-connectome proportions.
chisq.test(connectome_stats_beta_negative$N, p=coi_stats$P, simulate.p.value = T)
## 
##  Chi-squared test for given probabilities with simulated p-value (based
##  on 2000 replicates)
## 
## data:  connectome_stats_beta_negative$N
## X-squared = 97.543, df = NA, p-value = 0.0004998
# Positive vs. negative within/between distributions (both directions).
chisq.test(x = connectome_stats_beta_positive$N, p = connectome_stats_beta_negative$P, simulate.p.value = T)
## 
##  Chi-squared test for given probabilities with simulated p-value (based
##  on 2000 replicates)
## 
## data:  connectome_stats_beta_positive$N
## X-squared = 0.50134, df = NA, p-value = 0.5982
chisq.test(p = connectome_stats_beta_positive$P, x = connectome_stats_beta_negative$N, simulate.p.value = T)
## 
##  Chi-squared test for given probabilities with simulated p-value (based
##  on 2000 replicates)
## 
## data:  connectome_stats_beta_negative$N
## X-squared = 0.64103, df = NA, p-value = 0.5167

W Distribution by Region

Beta matrix B

# Convert a connectome data frame (connection strings "from-to" plus a Beta
# column) into a 264 x 264 matrix with Beta at [from, to]. Only the stated
# direction is filled; the matrix is NOT symmetrized here (see
# make_symmetric()). The original row-by-row loop also broke on an empty
# connectome (1:nrow gave c(1, 0)); the vectorized form handles that case.
connectom2matrix <- function(connectome) {
  mat <- matrix(0, 264, 264,
                dimnames = list(paste0("X", 1:264), paste0("X", 1:264)))
  # Parse "from-to" ROI indices once, vectorized (as.character also accepts
  # factor columns).
  parts <- strsplit(as.character(connectome$connection), "-", fixed = TRUE)
  from <- vapply(parts, function(p) as.integer(p[1]), integer(1))
  to   <- vapply(parts, function(p) as.integer(p[2]), integer(1))
  # Indexing with an n x 2 matrix assigns all cells at once; with duplicate
  # (from, to) pairs the last value wins, matching the original loop.
  mat[cbind(from, to)] <- connectome$Beta
  return(mat)
}

# Shared helper: place one connectome column into an empty 264 x 264 matrix
# at the linear positions given by connectome$index. Replaces three
# copy-pasted functions that differed only in the column used.
connectom2col_matrix <- function(connectome, col) {
  mat <- matrix(0, 264, 264,
                dimnames = list(paste0("X", 1:264), paste0("X", 1:264)))
  mat[connectome$index] <- connectome[[col]]
  return(mat)
}

# Column-specific variants kept for backward compatibility with callers.
connectom2Bmatrix <- function(connectome) connectom2col_matrix(connectome, "Beta")
connectom2Wmatrix <- function(connectome) connectom2col_matrix(connectome, "W")
connectom2Amatrix <- function(connectome) connectom2col_matrix(connectome, "A")

# Map matrix values back onto the connectome data frame: adds column
# `col_name` holding mat[connectome$index]. Replaces the temp-column +
# rlang rename dance with a direct assignment (no dplyr dependency); the
# new column is still appended last, as before.
matrix2connectom <- function(mat, connectome, col_name) {
  connectome[[col_name]] <- mat[connectome$index]
  return(connectome)
}

# Mirror the upper triangle of a square matrix into its (all-zero) lower
# triangle, producing a symmetric matrix.
make_symmetric <- function(m) {
  mirror <- t(m)
  lower <- lower.tri(m)
  m[lower] <- mirror[lower]
  return(m)
}

# Group-level Beta matrix (264 x 264) assembled from the LASSO connectome.
B_mat <- connectom2Bmatrix(connectome)
#sum(B_mat!=0)

Calculate averaged corr matrix across participants: A

The averaged correlation coefficients across subjects were calculated by first transforming each r value into a z-value, and then retransforming the average z value back into an equivalent r value using the hyperbolic tangent transformation (Silver & Dunlap, 1987).

Using an N = 176 × M = 622 dataframe (N = number of subjects), obtaining a result matrix A of size 264 × 264

Importantly, we used the filtered X functional connectivity matrix (178 x 622), rather than the original one, to calculate the averaged matrix A

# z transform vector
# Keep only the retained connections (columns in I); all other cells stay 0.
X.flat <- matrix(0, nrow = dim(C)[1], ncol = dim(C)[2])
X.flat[,as_vector(I)] <- C[,as_vector(I)]
X.flat <- X.flat[keep_X, ]  # only keep subjects who have rsfc --> 176 subj

# calculate column mean
# Average each connection across subjects.
X.colMean <- colMeans(X.flat)

# back-transform to matrix 264*264 with tanh
# NOTE(review): tanh back-transforms a mean of Fisher-z values; this assumes
# C already holds z-transformed correlations -- confirm upstream.
X.mat <- matrix(X.colMean, nrow=264,ncol=264,byrow=FALSE) # already double-checked, correct matrix back-transform
A_mat <- tanh(X.mat)
#sum(A_mat!=0)

Group-level weighted matrix W

# Group-level weights: elementwise product of averaged correlations (A) and
# LASSO betas (B), then mirrored to make the matrix symmetric.
W_mat <- data.frame(A_mat * B_mat)
W_mat[lower.tri(W_mat)] <- t(W_mat)[lower.tri(W_mat)]

#typeof(W_mat)
# Number of non-zero (surviving) connections, counted in both triangles.
sum(W_mat!=0)
## [1] 138
#which(W_mat!=0,arr.ind = T)

# column and row names 
colnames(W_mat) <- paste0("ROI",seq(1,264))
rownames(W_mat) <- paste0("ROI",seq(1,264))

Merge Beta, A, W

# Attach the averaged correlations (A) and group weights (W) to the
# connectome, then split it by the sign of Beta.
with_A <- matrix2connectom(A_mat, connectome, "A")
with_W <- matrix2connectom(as.matrix(W_mat), connectome, "W")

connectome.full <- connectome %>%
  merge(with_A, by = colnames(connectome)) %>%
  merge(with_W, by = colnames(connectome))

# Declarative (positive Beta) vs. procedural (negative Beta) connections.
connectome_beta_positive <- subset(connectome.full, Beta > upperBeta)
connectome_beta_negative <- subset(connectome.full, Beta < lowerBeta)

# positive network in W: ROIs touched by at least one positive-W connection.
pos_from <- vregion_from(connectome_beta_positive$connection)
pos_to <- vregion_to(connectome_beta_positive$connection)
lROIs_beta_positive <- unique(c(pos_from, pos_to))

rois_beta_positive <- power2011[lROIs_beta_positive, ]

roi_stats_beta_positive <- rois_beta_positive %>%
  group_by(NetworkName, Color) %>%
  summarise(N = length(Color) / length(lROIs_beta_positive)) %>%
  # keep all networks in, even those with no selected ROIs
  right_join(power2011 %>% distinct(NetworkName, Color)) %>%
  add_column(Source = "W_positive") %>%
  mutate(N = replace_na(N, 0)) %>%
  arrange(NetworkName)

# negative network in W: ROIs touched by at least one negative-W connection.
neg_from <- vregion_from(connectome_beta_negative$connection)
neg_to <- vregion_to(connectome_beta_negative$connection)
lROIs_beta_negative <- unique(c(neg_from, neg_to))

rois_beta_negative <- power2011[lROIs_beta_negative, ]

roi_stats_beta_negative <- rois_beta_negative %>%
  group_by(NetworkName, Color) %>%
  summarise(N = length(Color) / length(lROIs_beta_negative)) %>%
  # keep all networks in
  right_join(power2011 %>% distinct(NetworkName, Color)) %>%
  add_column(Source = "W_negative") %>%
  mutate(N = replace_na(N, 0)) %>%
  arrange(NetworkName)

Visualize distribution of W connections

# Pie charts of network composition: full atlas vs. the W-positive and
# W-negative ROI sets. Fixes the "Distriution" typo in the original title.
noi_stats %>%
  rbind(roi_stats_beta_positive) %>%
  rbind(roi_stats_beta_negative) %>%
  ggplot(aes(x = "", y = N, fill = NetworkName)) +
  geom_bar(stat = "identity", col = "white", width = 1) +
  facet_grid(~Source, labeller = label_both) +
  # Stacked bar on polar coordinates = pie chart.
  coord_polar("y", start = 0) +
  scale_y_continuous(labels = scales::percent_format(accuracy = 2L)) +
  scale_fill_ucscgb() +
  ylab("") +
  xlab("") +
  ggtitle("Distribution of Regions") +
  theme_pander() +
  guides(fill = guide_legend(ncol = 2)) +
  theme(legend.position = "bottom")

None of the tests show significant results

# Same chi-squared comparisons as before, now on the W-based ROI sets.
# W-positive ROI counts vs. the full-atlas network distribution.
chisq.test(roi_stats_beta_positive$N*length(lROIs_beta_positive), p = noi_stats$N, simulate.p.value = TRUE)
## 
##  Chi-squared test for given probabilities with simulated p-value (based
##  on 2000 replicates)
## 
## data:  roi_stats_beta_positive$N * length(lROIs_beta_positive)
## X-squared = 19.537, df = NA, p-value = 0.1274
# W-negative ROI counts vs. the full-atlas network distribution.
chisq.test(roi_stats_beta_negative$N*length(lROIs_beta_negative), p = noi_stats$N, simulate.p.value = TRUE)
## 
##  Chi-squared test for given probabilities with simulated p-value (based
##  on 2000 replicates)
## 
## data:  roi_stats_beta_negative$N * length(lROIs_beta_negative)
## X-squared = 18.314, df = NA, p-value = 0.1484
# Direct comparison: W-positive vs. W-negative network counts.
chisq.test(x = roi_stats_beta_positive$N*length(lROIs_beta_positive), 
           y = roi_stats_beta_negative$N*length(lROIs_beta_negative),
           simulate.p.value = TRUE)
## 
##  Pearson's Chi-squared test with simulated p-value (based on 2000
##  replicates)
## 
## data:  roi_stats_beta_positive$N * length(lROIs_beta_positive) and roi_stats_beta_negative$N * length(lROIs_beta_negative)
## X-squared = 53.667, df = NA, p-value = 0.7536
# Same comparison with Fisher's exact test.
fisher.test(x = roi_stats_beta_positive$N*length(lROIs_beta_positive), 
           y = roi_stats_beta_negative$N*length(lROIs_beta_negative))
## 
##  Fisher's Exact Test for Count Data
## 
## data:  roi_stats_beta_positive$N * length(lROIs_beta_positive) and roi_stats_beta_negative$N * length(lROIs_beta_negative)
## p-value = 0.6369
## alternative hypothesis: two.sided

The between/within distribution of W is exactly the same as the Beta matrix. This is not surprising because most of the correlation values in the A matrix are positive, suggesting that the signs in W and B should be pretty much the same.

# Within/Between composition of the W-positive and W-negative connectomes,
# compared against the full connectome (coi_stats). Fixes the
# "Distribuction" typo in the original title.
connectome_stats_beta_positive <- connectome_beta_positive %>%
  group_by(connection_type) %>%
  summarise(N = length(index),
            P = length(index) / dim(connectome_beta_positive)[1]) %>%
  add_column(Source = "W_positive")

connectome_stats_beta_negative <- connectome_beta_negative %>%
  group_by(connection_type) %>%
  summarise(N = length(index),
            P = length(index) / dim(connectome_beta_negative)[1]) %>%
  add_column(Source = "W_negative")

coi_stats %>%
  rbind(connectome_stats_beta_positive) %>%
  rbind(connectome_stats_beta_negative) %>%
  ggplot(aes(x = "", y = P, fill = connection_type)) +
  geom_bar(stat = "identity", col = "grey", width = 1) +
  facet_grid(~Source, labeller = label_both) +
  coord_polar("y", start = 0) +
  scale_y_continuous(labels = scales::percent_format(accuracy = 2L)) +
  scale_fill_jama() +
  scale_color_jama() +
  ylab("") +
  xlab("") +
  ggtitle("Distribution of Connectivity\n(Within/Between Networks)") +
  geom_text_repel(aes(label = percent(P, .1)), col = "white",
                  position = position_stack(vjust = 1), size = 3) +
  theme_pander() +
  theme(legend.position = "bottom")


Graph Analysis

iGraph

# Split the network and connection labels into from/to parts. The -1 network
# code is first recoded as 0 so "-" remains an unambiguous separator.
# Replaces the F literal with FALSE (F is a reassignable binding).
connectome_network <- connectome.full %>%
  mutate(new_network = str_replace_all(network, c("^-1"="0", "--1$"="-0"))) %>%
  separate(new_network, remove = FALSE, sep = "-",
           into = c("network_from", "network_to")) %>%
  separate(connection, remove = FALSE, sep = "-",
           into = c("connection_from", "connection_to")) %>%
  arrange(network_from, network_to)

# ROI-level edge list: one row per connection, weighted by W.
roi_links <- connectome_network %>%
  dplyr::select(connection_from, connection_to, W, network_names)
colnames(roi_links) <- c("from", "to", "weight", "type")

# Node table: one row per ROI; the -1 network code becomes 0.
roi_nodes <- power2011 %>% rename(id = ROI) %>%
  mutate(NetworkName = as.factor(NetworkName),
         Color = factor(Color),
         Network = ifelse(Network == -1, 0, Network))
# NOTE(review): sample() is unseeded, so node colors change between renders;
# call set.seed() first if reproducible colors are wanted.
# (T/F literals replaced with TRUE/FALSE.)
levels(roi_nodes$Color) <- sample(colors(TRUE), 14)

# Undirected ROI graph: nodes = Power (2011) ROIs, edges weighted by W.
net <- graph_from_data_frame(d = roi_links, vertices = roi_nodes, directed = FALSE)
#g <- graph_from_adjacency_matrix(W_mat,, mode = "upper")

Plot connection based on network color

# ROI-level circular plot: positive-W edges in dark red, negative-W edges in
# slate grey (see the index trick on edge.color); node size scales with
# degree.
plot(net, 
     # === vertex
      vertex.color=V(net)$Color, 
      vertex.frame.color = "gray85",
      vertex.shape="circle",
      vertex.size=igraph::degree(net) *10,
      #vertex.size2=NA,
     
     # === vertex label
      vertex.label=NA,                   # Character vector used to label the nodes
      vertex.label.color="black", #V(nnet)$colors,
      vertex.label.family="Arial",                  # Font family of the label (e.g.“Times”, “Helvetica”)
      vertex.label.font=2,                          # Font: 1 plain, 2 bold, 3, italic, 4 bold italic, 5 symbol
      vertex.label.cex=1,                           # Font size (multiplication factor, device-dependent)
      vertex.label.dist=2,                          # Distance between the label and the vertex
      vertex.label.degree=90, 
     
     # === Edge
      edge.color=c("dark red", "slategrey")[as.numeric(E(net)$weight<0)+1], # gray <0, red > 0                           # Edge color
      edge.width=abs(E(net)$weight)*10,            # Edge width, defaults to 1
      edge.arrow.size=1,                            # Arrow size, defaults to 1
      edge.arrow.width=NA,      # Arrow width, defaults to 1
      edge.lty="solid",                             # Line type, could be 0 or “blank”, 1 or “solid”, 2 or “dashed”, 3 or “dotted”, 4 or “dotdash”, 5 or “longdash”, 6 or “twodash”
      edge.curved=0.3,  
     
     layout=layout_in_circle) 
# Legend: one entry per network, colored like the nodes.
legend("right", legend=unique(V(net)$NetworkName), col=unique(V(net)$Color), pch=19, title= "Networks",cex=.8,
       inset=c(-0.1,0)) 

Look at networks in different layouts

# All igraph layout functions, dropping the first match and the ones that do
# not apply to this graph.
layouts <- grep("^layout_", ls("package:igraph"), value=TRUE)[-1] 

# Remove layouts that do not apply to our graph.
layouts <- layouts[!grepl("bipartite|components|drl|kk|sugiyama", layouts)]

# Draw the network once per layout on a 3x3 grid.
par(mfrow=c(3,3), mar=c(1,1,1,1))
for (layout_name in layouts) {
  coords <- do.call(layout_name, list(net))
  plot(net, edge.arrow.mode=0, layout=coords, main=layout_name,
       vertex.color=V(net)$Color,
       vertex.label=NA,
       vertex.size=igraph::degree(net) *10,
       edge.color=c("dark red", "slategrey")[as.numeric(E(net)$weight<0)+1],
       edge.width=abs(E(net)$weight)*10, 
       edge.curved=0.3)
}

# Network-level edge list: only Between-network connections, keyed by the
# numeric network ids.
net_links <- connectome_network %>%
  filter(connection_type == "Between") %>%
  dplyr::select(network_from, network_to, W, connection_type)
colnames(net_links) <- c("from", "to", "weight", "type")

# One node per network; remap colors that plot poorly and recode -1 -> 0.
net_nodes <- power2011 %>%
  mutate(name = factor(NetworkName),
         colors = tolower(Color),
         colors = if_else(colors == "white", "wheat", colors),
         colors = if_else(colors == "teal", "tomato2", colors),
         colors = if_else(colors == "pale blue", "steelblue", colors),
         id = ifelse(Network == -1, 0, Network),
         colors = factor(colors)) %>%
  distinct(id, name, colors) %>%
  as.data.frame() %>%
  dplyr::select(id, name, colors) %>%
  arrange(id)
#levels(roi_nodes$Color) <- sample(colors(T), 14) 

# Prefix ids with "n" so igraph treats them as names, not vertex indices.
net_links <- net_links %>%
  mutate(from = paste0("n", from), to = paste0("n", to))
net_nodes <- net_nodes %>% mutate(id = paste0("n", id))

# Undirected network-level graph.
nnet <- graph_from_data_frame(d = net_links, vertices = net_nodes, directed = FALSE)

This plot shows the distribution of all networks. Red links represent positive W (Declarative networks); gray links represent negative W (Procedural networks).

# plot
# Network-level circular plot: one node per network; positive-W edges in
# dark red, negative-W edges in slate grey; edge width scales with |W|.
plot(nnet, 
      # === vertex
      vertex.color=V(nnet)$colors, 
      vertex.frame.color = "gray85",
      vertex.shape="circle",
      vertex.size=20,
      vertex.size2=NA,
     
      # === vertex label
      vertex.label=V(nnet)$name,                   # Character vector used to label the nodes
      vertex.label.color="black", #V(nnet)$colors,
      vertex.label.family="Arial",                  # Font family of the label (e.g.“Times”, “Helvetica”)
      vertex.label.font=2,                          # Font: 1 plain, 2 bold, 3, italic, 4 bold italic, 5 symbol
      vertex.label.cex=1,                           # Font size (multiplication factor, device-dependent)
      vertex.label.dist=2,                          # Distance between the label and the vertex
      vertex.label.degree=90,                        # The position of the label in relation to the vertex (use pi)
    
      # === Edge
      edge.color=c("dark red", "slategrey")[as.numeric(E(nnet)$weight<0)+1], # gray <0, red > 0                           # Edge color
      edge.width=abs(E(nnet)$weight)*20,            # Edge width, defaults to 1
      edge.arrow.size=1,                            # Arrow size, defaults to 1
      edge.arrow.width=NA,      # Arrow width, defaults to 1
      edge.lty="solid",                             # Line type, could be 0 or “blank”, 1 or “solid”, 2 or “dashed”, 3 or “dotted”, 4 or “dotdash”, 5 or “longdash”, 6 or “twodash”
      edge.curved=0.3,                              # Edge curvature, range 0-1 (FALSE sets it to 0, TRUE to 0.5)
     
     layout = layout.circle)
#legend(legend=V(nnet)$name, col=V(nnet)$colors,
#       text.col = "black", horiz = F , pt.cex=2, cex=1.2, bty="n", pch = 21, ncol=2,
#       omset=3)
# Two-column legend of networks below the plot.
legend("bottom", legend=V(nnet)$name, col=V(nnet)$colors, pch=19, title= "Networks", ncol = 2, cex=1)

Circlize

Look at positive W and negative W networks

library(circlize)
# reform power power2011
# Reshape the Power 2011 atlas into the column layout used by the chord
# diagrams below. NOTE(review): "power_atals" looks like a typo for
# "power_atlas", but the name is referenced later, so it is kept.
power_atals <- power2011 %>% 
  rename(ROI.Name = ROI, x.mni=X, y.mni=Y, z.mni=Z, network=NetworkName) %>% 
  mutate(ROI.Name=as.integer(ROI.Name), index = as.integer(ROI.Name),
         x.mni=as.integer(x.mni), y.mni=as.integer(y.mni), z.mni=as.numeric(z.mni)) %>%
  dplyr::select(ROI.Name, x.mni, y.mni, z.mni, network, index)


 
# Symmetrized group-weight matrix, labeled by network name for the chords.
W_mat2 <- make_symmetric(connectom2Wmatrix(connectome.full))
rownames(W_mat2) = power_atals$network
colnames(W_mat2) = power_atals$network

# 14 distinct grid colors, one per network.
# NOTE(review): Set2 provides at most 8 colors; requesting n = 9 triggers a
# brewer.pal warning -- confirm this is acceptable.
colors14 = c(brewer.pal(name="Set1", n = 9), brewer.pal(name="Set2", n = 9))[1:14]

# Positive-W (declarative) links only; negative links painted white (hidden).
chordDiagram(W_mat2, directional = FALSE, transparency = 0.5, self.link = 1, symmetric = TRUE, scale = T, grid.col = colors14, col = ifelse(W_mat2>0, "tomato2", "white"))
  title("Network connections: Declarative (Positive W)")

# Negative-W (procedural) links only.
chordDiagram(W_mat2, directional = FALSE, transparency = 0.5, self.link = 1, symmetric = TRUE, scale = T, 
             grid.col = colors14, col = ifelse(W_mat2<0, "steelblue", "white"))
title("Network connections: Procedural (Negative W)")

# Both signs together: red = declarative, blue = procedural.
chordDiagram(W_mat2, directional = FALSE, transparency = 0.5, self.link = 1, symmetric = TRUE, scale = T, 
             grid.col = colors14, col = ifelse(W_mat2>0, "tomato2", "steelblue"))
             #annotationTrackHeight=mm_h(c(5, 2)))
title("Network connections: Full W")

# Reset circlize state before the next diagram.
circos.clear()

# Make node names unique (network name + ROI number) for the ROI-level chord.
W_mat3 <- W_mat2
colnames(W_mat3) <- paste(colnames(W_mat2),1:264, sep = "")
rownames(W_mat3) <- paste(rownames(W_mat2),1:264, sep = "")
#W_mat3

# Group sectors by network (digits stripped from the unique names).
nm = unique(unlist(dimnames(W_mat3)))
group = structure(gsub("\\d", "", nm), names = nm)
# NOTE(review): the unseeded sample() shuffles `group`, and `group` is not
# passed to the chordDiagram call below -- confirm this is intentional.
group = factor(group[sample(length(group), length(group))], levels = unique(power_atals$network))


#grid.col = structure(c(rep(2, 5), rep(3, 5), rep(4, 5)), names = c(paste0("A", 1:5), paste0("B", 1:5), paste0("C", 1:5)))
# Per-ROI chord diagram (one sector per disambiguated ROI name), with an extra
# outer track pre-allocated for the ROI-index labels drawn below.
chordDiagram(W_mat3, directional = FALSE, transparency = .5, self.link = 2, symmetric = TRUE, scale = TRUE, 
             col = ifelse(W_mat2>0, "tomato2", "steelblue"), annotationTrack = c("grid"),
             preAllocateTracks = list(track.height = mm_h(4), track.margin = c(mm_h(4), 0)))

# Write each sector's numeric ROI index (the digits of its sector name) into
# the pre-allocated track, rotated to follow the circle.
circos.track(track.index = 2, panel.fun = function(x, y) {
    sector.index = get.cell.meta.data("sector.index")
    xlim = get.cell.meta.data("xlim")
    ylim = get.cell.meta.data("ylim")
    xplot = get.cell.meta.data("xplot")
    
    #print(sector.index)
    
    #circos.text(mean(xlim), 1, sector.index, niceFacing = TRUE, adj = c(-0.5, 0))
    circos.text(mean(xlim), mean(ylim), gsub(pattern = "\\D", "", sector.index), cex = .8, facing = "clockwise", niceFacing = TRUE, adj = c(-1, 0))
}, bg.border = NA)

### Network and node descriptives

Density

The proportion of present edges from all possible edges in the network.

# Split the full graph by edge sign: the "declarative" network keeps only the
# positive-weight edges, the "procedural" network only the negative ones.
net.d <- net - E(net)[E(net)$weight<0]
net.p <- net - E(net)[E(net)$weight>0]

# Bar plot of edge density (proportion of possible edges present) per network.
data.frame("edge_density" = c(edge_density(net.d, loops = FALSE),
                              edge_density(net.p, loops = FALSE),
                              edge_density(net, loops = FALSE)),
           "network" = factor(c("Declarative", "Procedural", "Full"), levels = c("Declarative", "Procedural", "Full"))) %>%
  ggplot(aes(x = network, y = edge_density, fill = network)) +
  geom_col(width = .5) +
  geom_text(aes(label = round(edge_density, 5)), position = position_dodge(width = 0.5), vjust = -0.5) +
  theme_pander()

Diameter

A network diameter is the longest geodesic distance (length of the shortest path between two nodes) in the network. In igraph, diameter() returns the distance, while get_diameter() returns the nodes along the first found path of that distance.

# igraph's shortest-path routines need non-negative weights, so flip the sign
# of the (all-negative) procedural weights...
net.p.abs <- net.p
E(net.p.abs)$weight <- E(net.p.abs)$weight * (-1)

# ...and take absolute values of the negative weights in the mixed-sign full graph.
net.abs <- net
E(net.abs)$weight[E(net.abs)$weight<0] <- E(net.abs)$weight[E(net.abs)$weight<0] * (-1)


# Bar plot of diameter (longest geodesic) per network.
data.frame("diameter" = c(diameter(net.d, directed = FALSE),
                          diameter(net.p.abs, directed = FALSE),
                          # was directed=T: these graphs are undirected, so use
                          # FALSE consistently with the two calls above
                          diameter(net.abs, directed = FALSE)),
           "network" = factor(c("Declarative", "Procedural", "Full"), levels = c("Declarative", "Procedural", "Full"))) %>%
  ggplot(aes(x = network, y = diameter, fill = network)) +
  geom_col(width = .5) +
  geom_text(aes(label = round(diameter, 2)), position = position_dodge(width = 0.5), vjust = -0.5) +
  theme_pander()

Centrality & centralization

Centrality functions (vertex level) and centralization functions (graph level). The centralization functions return res - vertex centrality, centralization, and theoretical_max - maximum centralization score for a graph of that size. The centrality function can run on a subset of nodes (set with the vids parameter). This is helpful for large graphs where calculating all centralities may be a resource-intensive and time-consuming task.

Centrality is a general term that relates to measures of a node’s position in the network. There are many such centrality measures, and it can be a daunting task to wade through all of the different ways to measure a node’s importance in the network. Here, we will introduce just a few examples.

Degree

Degree centrality is simply the number of edges connected to a given node. In a social network, this might mean the number of friends an individual has. We can calculate degree centrality with a simple function: degree()

# Bar plot of graph-level degree centralization per network.
data.frame("centr_degree" = c(centr_degree(net.d, normalized = TRUE)$centralization,
                              centr_degree(net.p, normalized = TRUE)$centralization,
                              centr_degree(net, normalized = TRUE)$centralization),
           "network" = factor(c("Declarative", "Procedural", "Full"), levels = c("Declarative", "Procedural", "Full"))) %>%
  ggplot(aes(x = network, y = centr_degree, fill = network)) +
  geom_col(width = .5) +
  geom_text(aes(label = round(centr_degree, 4)), position = position_dodge(width = 0.5), vjust = -0.5) +
  theme_pander()

In weighted networks, we can also compute node strength, which is the sum of the weights of edges connected to the node. Let’s calculate node strength and plot the node sizes as proportional to these values.

Betweenness

Let’s now do the same for betweenness centrality, which is defined as the number of geodesic paths (shortest paths) that go through a given node. Nodes with high betweenness might be influential in a network if, for example, they capture the most amount of information flowing through the network because the information tends to flow through them. Here, we use the normalized version of betweenness.

Closeness (centrality based on distance to others in the graph) Inverse of the node’s average geodesic distance to others in the network.

# Bar plot of graph-level closeness centralization per network.
ggplot(
  data.frame(
    "centr_clo" = c(centr_clo(net.d)$centralization,
                    centr_clo(net.p)$centralization,
                    centr_clo(net)$centralization),
    "network" = factor(c("Declarative", "Procedural", "Full"),
                       levels = c("Declarative", "Procedural", "Full"))
  ),
  aes(x = network, y = centr_clo, fill = network)
) +
  geom_col(width = .5) +
  geom_text(aes(label = round(centr_clo, 8)),
            position = position_dodge(width = 0.5), vjust = -0.5) +
  theme_pander()

Distances

# Bar plot of mean geodesic distance per network.
data.frame("mean_distance" = c(mean_distance(net.d, directed = FALSE),
                               mean_distance(net.p, directed = FALSE),
                               mean_distance(net, directed = FALSE)),
           "network" = factor(c("Declarative", "Procedural", "Full"), levels = c("Declarative", "Procedural", "Full"))) %>%
  ggplot(aes(x = network, y = mean_distance, fill = network)) +
  geom_col(width = .5) +
  geom_text(aes(label = round(mean_distance, 4)), position = position_dodge(width = 0.5), vjust = -0.5) +
  theme_pander()

Assortativity

# Bar plot of degree assortativity per network.
data.frame("assortativity_degree" = c(assortativity_degree(net.d, directed = FALSE),
                                      assortativity_degree(net.p, directed = FALSE),
                                      assortativity_degree(net, directed = FALSE)),
           "network" = factor(c("Declarative", "Procedural", "Full"), levels = c("Declarative", "Procedural", "Full"))) %>%
  ggplot(aes(x = network, y = assortativity_degree, fill = network)) +
  geom_col(width = .5) +
  geom_text(aes(label = round(assortativity_degree, 4)), position = position_dodge(width = 0.5), vjust = -0.5) +
  theme_pander()

Hierarchical edge bundling

This package has some bugs.

  1. The weight of a connection cannot be set
  2. The color of a connection is not set by the weight values (W)
# Hierarchical structure for edge bundling: origin -> networks -> ROI leaves.
d1 <- data.frame(from = "origin", to = unique(power_atals$network))
d2 <- data.frame(from = power_atals$network, to = power_atals$index)
hierarchy <- rbind(d1, d2)

# Leaf ids (ROI indices 1..264).
# NOTE(review): all_leaves is not referenced by the chunks below — confirm it is needed.
all_leaves <- seq_len(264)
# One row per ROI-to-ROI connection between leaves, weighted by W.
connect <- connectome_network %>% mutate(from = connection_from, to = connection_to, weight = W) %>% dplyr::select(from, to, weight)

# Per-ROI weight of the strongest connection *starting* at each ROI.
# The right_join against the full 1..264 id list guarantees one row per ROI;
# ROIs with no outgoing connection get W = NA, replaced by 0 below.
# connection from weights
cf <- connectome_network %>% 
  dplyr::select(connection_from, W) %>%
  right_join(data.frame(connection_from=as.character(c(1:264))), 
             by = "connection_from") %>%
  # some multiple connections
  # max(W) collapses ROIs that start several connections down to one row
  group_by(connection_from) %>% 
  summarise(W = max(W)) %>%
  mutate(W = replace_na(W, 0))

# Per-ROI weight of the strongest connection *ending* at each ROI.
# NOTE(review): the join target here is derived from power_atals$index rather
# than the inline 1:264 used for cf — the two appear equivalent (index is
# 1..264); confirm, and consider spelling them the same way.
# connection to weights
ct <- connectome_network %>% 
  dplyr::select(connection_to, W) %>%
  right_join(power_atals %>% mutate(connection_to=as.character(index)) %>%  dplyr::select(connection_to), by = "connection_to") %>%
  group_by(connection_to) %>% 
  summarise(W = max(W)) %>%
  mutate(W = replace_na(W, 0))


# create a vertices data.frame. One line per object of our hierarchy
# Row order is unique(from, to): "origin" + the networks first, then the 264
# ROI leaves. The rep(NA, 15) pads the 15 non-leaf vertices (origin + 14
# networks) so the per-leaf vectors (degree, cf$W, ct$W) line up with the
# leaves — NOTE(review): 15 is hard-coded; verify it matches the network count.
vertices  <-  data.frame(
  name = unique(c(as.character(hierarchy$from), as.character(hierarchy$to))) , 
  degree = as.numeric(c(rep(NA, 15), igraph::degree(net))),
  weight_from = as.numeric(c(rep(NA, 15), cf$W)), 
  weight_to = as.numeric(c(rep(NA, 15), ct$W))
) 
# Let's add a column with the group of each name. It will be useful later to color points
vertices$group  <-  hierarchy$from[ match( vertices$name, hierarchy$to ) ]

#Let's add information concerning the label we are going to add: angle, horizontal adjustement and potential flip
#calculate the ANGLE of the labels
# Leaves are the vertices that never appear as a "from" in the hierarchy.
vertices$id <- NA
myleaves <- which(is.na( match(vertices$name, hierarchy$from) ))
nleaves <- length(myleaves)
vertices$id[ myleaves ] <- seq(1:nleaves)
vertices$angle <- 90 - 360 * vertices$id / nleaves
 
# calculate the alignment of labels: right or left
# If I am on the left part of the plot, my labels have currently an angle < -90
vertices$hjust <- ifelse( vertices$angle < -90, 1, 0)
 
# flip angle BY to make them readable
vertices$angle <- ifelse(vertices$angle < -90, vertices$angle+180, vertices$angle)

# T/F
# TRUE for leaves that take part in at least one connection.
# NOTE(review): this compares the sequential leaf id against the connection
# endpoint values — it only works if leaf ids coincide with ROI names/indices
# (i.e. leaves appear in order 1..264); confirm that alignment holds.
vertices$show_connection <- vertices$id %in% union(connectome_network$connection_from, connectome_network$connection_to)
# rk: within-group ordering of leaves (used for optional group labels); -1 for
# the non-network "origin"/NA groups.
vertices <- vertices %>%  group_by(group) %>% mutate(rk = ifelse(group%in%unique(power_atals$network), order(id), -1))

# Create a graph object
# Hierarchy edges define the dendrogram; vertices carry the plotting metadata.
mygraph <- graph_from_data_frame( hierarchy, vertices=vertices )

# The connection object must refer to the ids of the leaves:
from  <-  match( connect$from, vertices$name)
to  <-  match( connect$to, vertices$name)

# Circular dendrogram with bundled leaf-to-leaf connections.
ggraph(mygraph, layout = 'dendrogram', circular = TRUE) +
  # connection data
  # NOTE(review): ..index.. is the legacy spelling of after_stat(index) — it
  # colors along each bundle's path, not by connection weight; confirm intended.
  geom_conn_bundle(data = get_con(from = from, to = to), width=2, alpha=0.6, aes(colour=..index..)) +
  scale_edge_colour_distiller(type = "seq", palette = "Paired", direction=1, na.value = "grey50") +

  # node features
  # Leaf points pushed slightly outward (x/y * 1.07), sized by node degree.
  geom_node_point(aes(filter = leaf, x = x*1.07, y=y*1.07, colour=group, size=degree, alpha=0.2)) +
  scale_colour_manual(values=colors14)+
  scale_size_continuous( range = c(0.1,10)) + 
  #scale_colour_brewer(palette = "Set1") +
  
  # label features
  # Only label leaves that participate in a connection (show_connection flag).
  geom_node_text(aes(x=x*1.2, y=y*1.2, filter=show_connection, label=name), size=2, alpha=1, 
                 check_overlap = TRUE, show.legend = NA) +
  #geom_node_text(aes(x=x*1.15, y=y*1.5, label = group, angle=0, colour=group, filter=rk==1), 
  #               check_overlap = TRUE, show.legend = NA)+
  
  # theme
  theme_void() +
  ggtitle("Network W")

# create a dataframe with connection between leaves (individuals)
# Same leaf-to-leaf connections as before, with the weight column named
# "value" this time (it is not mapped in the plot below).
connect <- connectome_network %>% mutate(from=connection_from, to=connection_to, value=W) %>% dplyr::select(from, to, value)
 
# create a vertices data.frame. One line per object of our hierarchy
# Minimal variant: only degree is carried per leaf; rep(NA, 15) again pads the
# non-leaf vertices (origin + networks) — verify the count matches.
vertices  <-  data.frame(name = unique(c(as.character(hierarchy$from), as.character(hierarchy$to))),
                         degree = as.numeric(c(rep(NA, 15), igraph::degree(net)))) 
# Let's add a column with the group of each name. It will be useful later to color points
vertices$group  <-  hierarchy$from[ match( vertices$name, hierarchy$to ) ]
 
 
# Create a graph object
mygraph <- graph_from_data_frame( hierarchy, vertices=vertices )
 
# The connection object must refer to the ids of the leaves:
from  <-  match( connect$from, vertices$name)
to  <-  match( connect$to, vertices$name)
 
# Basic graph
# NOTE(review): ..index.. colors along each bundle's path (legacy spelling of
# after_stat(index)); connection weights ("value") are not used here.
ggraph(mygraph, layout = 'dendrogram', circular = TRUE) + 
  geom_conn_bundle(data = get_con(from = from, to = to), aes(colour=..index..), alpha=0.8, width=0.9, tension=0.9) +
  scale_edge_colour_distiller(palette = "RdPu") +
  geom_node_point(aes(filter = leaf, x = x*1.05, y=y*1.05, colour=group, size=degree, alpha=0.2)) +
  scale_colour_manual(values= rep( brewer.pal(9,"Paired") , 30)) +
  scale_size_continuous( range = c(0.1,10) ) +
  theme_void()